In [479]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
In [ ]:
#data processing
#csv to pandas
exoDf = pd.read_csv('ExoplanetData.csv')
#remove HD from HD name column, redundant, also remove A
exoDf['hd_name'] = exoDf['hd_name'].str.extract('(\d+)', expand=False)
exoDf['hd_name'] = pd.to_numeric(exoDf['hd_name'], errors='coerce').fillna(exoDf['hd_name'])
exoDf['hd_name'] = exoDf['hd_name'].astype('Int64')
In [ ]:
hdDf = pd.read_csv('HDCatalogue.csv', sep=';')
In [76]:
#currently there are 121 different columns, lets reduce this to some usable ones.
#lets keep time discovered, name, plus anything that can be applied for regression,
#leaving out flags, discovery methods, etc.
#lets make a model that predicts based on our exoDf, the number of planets
#only keep necessary columns
#can make 3d plot of all observed exoplanets
columns_to_keep = ['pl_name', 'hostname', 'hd_name', 'hip_name', 'tic_id',
'gaia_id', 'sy_snum', 'sy_pnum', 'sy_mnum', 'pl_orbper',
'pl_orbsmax', 'pl_rade', 'pl_masse', 'pl_dens', 'pl_orbeccen',
'st_spectype', 'st_teff', 'st_rad', 'st_mass', 'st_met', 'st_lum',
'st_logg', 'st_age', 'st_dens', 'ra', 'dec', 'sy_dist', 'sy_plx',
'sy_bmag', 'sy_vmag', 'sy_kepmag']
exoDf = exoDf[columns_to_keep]
In [482]:
#make a 3d plot of all the stars? test this out
#converts degree in range 0-360 to a radian in range -pi to pi
def deg_from_neg_pi_to_pi(deg):
return (deg * np.pi / 180) - np.pi
#converts a degree in range -90 to 90 to a radian in range -pi/2 to pi/2
def deg_to_rad_90(deg):
return (deg * np.pi / 180)
raHd = hdDf['_RAJ2000'].apply(deg_from_neg_pi_to_pi)
raExo = exoDf['ra'].apply(deg_from_neg_pi_to_pi)
decHd = hdDf['_DEJ2000'].apply(deg_to_rad_90)
decExo = exoDf['dec'].apply(deg_to_rad_90)
# Set a higher DPI for better resolution
fig = plt.figure(figsize=(15, 10), dpi=200)
ax = plt.subplot(111, projection='aitoff')
# Initialize the plot with initial data
sc_hd = ax.scatter(raHd, decHd, s=1, c='blue', alpha=0.1)
sc_exo = ax.scatter(raExo, decExo, s=1, c='red', alpha=0.3)
ax.set_xlabel('Right Ascension')
ax.set_ylabel('Declination')
ax.set_title('Systems with Confirmed Exoplanets')
ax.grid(True)
# Update function for animation
def update(frame):
global sc_hd, sc_exo, sc_zero_line
# Shift in radians
shift_radians = np.deg2rad(frame) % (2 * np.pi)
# Update the RA for HD stars
new_ra_hd = ((raHd + shift_radians + np.pi) % (2 * np.pi)) - np.pi
# Update the RA for exoplanets
new_ra_exo = ((raExo + shift_radians + np.pi) % (2 * np.pi)) - np.pi
# Update the scatter plot data
sc_hd.set_offsets(np.column_stack((new_ra_hd, decHd)))
sc_exo.set_offsets(np.column_stack((new_ra_exo, decExo)))
return sc_hd, sc_exo
#uncomment if you want to make the animation!
ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 3), interval = 100, blit=True, repeat=True)
ani.save('stars_animation.mp4', writer='ffmpeg', dpi=200)
plt.show()
In [ ]:
#And the animation looks like this!

In [243]:
#lets make anothe graph of discovered exoplanets, with earth at the center
#cartesian
x = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.cos(np.deg2rad(exoDf['ra']))
y = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.sin(np.deg2rad(exoDf['ra']))
z = exoDf['sy_dist'] * np.sin(np.deg2rad(exoDf['dec']))
# Create a 3D scatter plot
fig = plt.figure(figsize=(12, 8), dpi=200)
ax = fig.add_subplot(111, projection='3d')
#plot
scatter = ax.scatter(x, y, z, c='red', marker='o', alpha=0.1)
# Set labels and title
ax.set_title('Cartesian plot of Confirmed Exoplanet Systems')
ax.set_xlim([-1500, 1500])
ax.set_ylim([-8000, 8000])
ax.set_zlim([-5000, 5000])
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
def update(frame):
ax.view_init(elev=10, azim=frame) # Rotate the view
return scatter,
#uncomment if you want to make the animation!
#ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 2), interval=100)
#ani.save('circular_3d_plot.gif', writer='pillow', dpi=200)
plt.show()
In [ ]:
#And the animation looks like this!

In [364]:
#lets make the model for the regression, visualize what we are working with
#relate spectral type and num of exoplanets
#extract just the spectral type symbol
exoDf['st_spectype'] = exoDf['st_spectype'].str[:2]
#group by spectype and find the mean of planets of that spectype
avg_exoplanets = exoDf.groupby('st_spectype')['sy_pnum'].mean()
#sort highest to lowest
avg_exoplanets_sorted = avg_exoplanets.sort_values(ascending=False)
plt.figure(figsize=(15, 10))
avg_exoplanets_sorted.plot(kind='bar', color='green')
plt.xlabel('Spectral Type')
plt.ylabel('Average Number of Exoplanets')
plt.title('Average Number of Exoplanets by Spectral Type')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
In [415]:
#relate age with num of exoplanets
#mean num of exoplanets for each age
mean_exoplanets_by_age = exoDf.groupby('st_age')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_age.index, mean_exoplanets_by_age.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_age.index.values, mean_exoplanets_by_age.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_age.index.values,
intercept + slope * mean_exoplanets_by_age.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Age (Billions of Years)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Age')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 0.009726610410438824
In [414]:
#relate stellar mass and number of exoplanets
# Filter the DataFrame to include only observations with stellar mass between 0 and 5
#solar masses, this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_mass'] >= 0) & (exoDf['st_mass'] <= 5)]
#mean num of exoplanets for each stellar mass
mean_exoplanets_by_mass = filtered_exoDf.groupby('st_mass')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_mass.index, mean_exoplanets_by_mass.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_mass.index.values, mean_exoplanets_by_mass.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_mass.index.values,
intercept + slope * mean_exoplanets_by_mass.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Mass (Solar Masses)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Mass')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 2.487538431838445e-17
In [413]:
#lets relate density to number of exoplanets
# Filter the DataFrame to include only observations with stellar density between 0 and 150 g/cm^3
#this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_dens'] >= 0) & (exoDf['st_dens'] <= 150)]
#mean num of exoplanets for each stellar density
mean_exoplanets_by_density = filtered_exoDf.groupby('st_dens')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_density.index, mean_exoplanets_by_density.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_density.index.values, mean_exoplanets_by_density.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_density.index.values,
intercept + slope * mean_exoplanets_by_density.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Density (g/cm^3)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Density')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 0.01653586935233488
In [416]:
#relate radius to num of exoplanets
#mean num of exoplanets for each stellar radii
mean_exoplanets_by_radius = exoDf.groupby('st_rad')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_radius.index, mean_exoplanets_by_radius.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_radius.index.values, mean_exoplanets_by_radius.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_radius.index.values,
intercept + slope * mean_exoplanets_by_radius.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Radius (Solar Radii)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Radius')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 3.1861626015923364e-11
In [417]:
#relate metallicity to num of exoplanets
# Filter the DataFrame to include only observations with stellar metallicity between 0 and 1 dex
# this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_met'] >= 0) & (exoDf['st_met'] <= 1)]
#mean num of exoplanets for each stellar metallicity
mean_exoplanets_by_met = filtered_exoDf.groupby('st_met')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_met.index, mean_exoplanets_by_met.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_met.index.values, mean_exoplanets_by_met.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_met.index.values,
intercept + slope * mean_exoplanets_by_met.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Metallicity (dex)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Metallicity')
plt.grid(True)
plt.tight_layout()
plt.show()
'''Note this p value is above standard 0.05, so the null hypothesis is not rejected
lets avoid using it in our regression''';
p value of: 0.5358958010334814
In [418]:
#relate effective temperature to number of exoplanets
# Filter the DataFrame to include only observations with stellar effective temperature
# between 0 and 10000 K. this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_teff'] >= 0) & (exoDf['st_teff'] <= 10000)]
#mean num of exoplanets for each stellar effective temperature
mean_exoplanets_by_teff = filtered_exoDf.groupby('st_teff')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_teff.index, mean_exoplanets_by_teff.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_teff.index.values, mean_exoplanets_by_teff.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_teff.index.values,
intercept + slope * mean_exoplanets_by_teff.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Effective Temperature (K)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Effective Temperature')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 2.0631232108630326e-13
In [419]:
#relate surface gravity to num of exoplanets
# Filter the DataFrame to include only observations with stellar metlalicity between 0 and 6
# log10(cm/s^2) surface gravity. this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_logg'] >= -2) & (exoDf['st_logg'] <= 6)]
#mean num of exoplanets for each log10(cm/s^2) surface gravity
mean_exoplanets_by_logg = filtered_exoDf.groupby('st_logg')['sy_pnum'].mean()
slope, intercept, r_value, p_value, std_error = stats.linregress(mean_exoplanets_by_logg.index, mean_exoplanets_by_logg.values)
print("p value of: " + str(p_value))
plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_logg.index.values, mean_exoplanets_by_logg.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_logg.index.values,
intercept + slope * mean_exoplanets_by_logg.index.values, color='red', linestyle='-',
linewidth=2)
plt.xlabel('Stellar Surface Gravity (log10(cm/s^2))')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Surface Gravity')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 8.664601243134237e-29
In [454]:
#lets apply these 6 terms (stellar age, stellar mass, stellar density,
# stellar radius, stellar effective temperature, stellar surface gravity) to a
# regression model, so we can predict the number of exoplanets in a given system
#first, lets do mean imputation on those columns, to get rid of na
columns_to_impute = ['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg']
for column in columns_to_impute:
exoDf[column] = exoDf[column].fillna(exoDf[column].mean())
#define features and target
X = exoDf[['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg']]
y = exoDf['sy_pnum']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
# Predict the number of exoplanets for the test set
y_pred = model.predict(X_test)
# Calculate and print metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# Plot the results
plt.figure(figsize=(15, 10))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.xlabel('Actual Number of Exoplanets')
plt.ylabel('Predicted Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets')
plt.grid(True)
plt.show()
Mean Squared Error: 1.4288107134911496 R-squared: 0.016702616477324184
In [474]:
# Define the cost function (mean squared error)
def compute_cost(X, y, theta):
m = len(y)
predictions = X.dot(theta)
cost = np.sum(np.square(predictions - y)) / (2 * m)
return cost
# Gradient descent function
def gradient_descent(X, y, theta, alpha, num_iterations):
m = len(y)
cost_history = []
for i in range(num_iterations):
# Calculate the predictions
predictions = X.dot(theta)
# Calculate the error
error = predictions - y
# Calculate the gradient
gradient = X.T.dot(error) / m
# Update parameters
theta -= alpha * gradient
# Compute the cost
cost = compute_cost(X, y, theta)
cost_history.append(cost)
# Print cost every 100 iterations
#if i % 100 == 0:
#print(f"Iteration {i}: Cost = {cost}")
return theta, cost_history
# Add a column of ones to X for the intercept term
X_train_with_intercept = np.c_[np.ones((X_train.shape[0], 1)), X_train]
# Initialize parameters (weights)
theta = np.zeros(X_train_with_intercept.shape[1])
# Set hyperparameters
alpha = 0.00000001
num_iterations = 10000
# Run gradient descent
theta, cost_history = gradient_descent(X_train_with_intercept, y_train, theta, alpha, num_iterations)
# Print final parameters
print("Final Parameters:", theta)
# Add a column of ones to X_test for the intercept term
X_test_with_intercept = np.c_[np.ones((X_test.shape[0], 1)), X_test]
# Predict using the trained model
y_pred = X_test_with_intercept.dot(theta)
# Calculate Mean Squared Error (MSE)
mse = np.mean((y_test - y_pred) ** 2)
# Plot the actual vs predicted values
'''
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red') # Add a diagonal line
plt.xlabel('Actual Number of Exoplanets')
plt.ylabel('Predicted Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets (Gradient Descent)')
plt.grid(True)
plt.show() '''
# Print MSE
print("Mean Squared Error:", mse)
# Plot loss curve
plt.plot(range(len(cost_history)), cost_history)
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.title('Loss Curve')
plt.show()
Final Parameters: [ 7.92327467e-06 4.83185211e-05 5.02741468e-07 4.25230969e-05 -5.30444439e-06 3.29997589e-04 3.97050269e-05] Mean Squared Error: 1.5937838075520914